/*
 * Copyright  2017 NXP
 * All rights reserved.
 *
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <stdio.h>

#include "fsl_common.h"
#include "board.h"
#include "pin_mux.h"

#include "timer.h"

#include "arm_math.h"
#include "arm_const_structs.h"

#include "fsl_powerquad.h"

/*******************************************************************************
 * Definitions
 ******************************************************************************/
/* Divisor applied to every product with an fft1024_cosTable[] / fft1024_sinTable[]
 * entry (presumably the tables store cos/sin pre-multiplied by this factor -
 * TODO confirm against the table definitions). */
#define FFT1024_TABLE_MULTIPLIER  1000
/* Set to a non-zero value to compile in the code that prints each FFT result. */
#define FFT1024_ENABLE_PRINT_RESULT 0

/*******************************************************************************
 * Prototypes
 ******************************************************************************/

/*******************************************************************************
 * Variables
 ******************************************************************************/
/* Tick count handed to timer_stop() after each timed section and then printed. */
uint32_t app_timer_ticks;

/* Twiddle tables and test signal; defined in another translation unit. */
extern int32_t fft1024_cosTable[1024];
extern int32_t fft1024_sinTable[1024];
extern q31_t   fft1024_selftestData[2048]; /* 1024 complex numbers. */

/* The five FFT variants benchmarked by this demo (defined below). */
void fft1024_arm_func1(void);
void fft1024_arm_func2(void);
void fft1024_powerquad_func1(void);
void fft1024_powerquad_func2(void);
void fft1024_powerquad_func3(void);

/* Table of demo entry points; main() runs them in this order, one per key press. */
typedef void (* func_0_t)(void);
func_0_t app_funcs[] =
{
    fft1024_arm_func1,
    fft1024_arm_func2,
    fft1024_powerquad_func1,
    fft1024_powerquad_func2,
    fft1024_powerquad_func3
};

/* Work buffers, all laid out as interleaved complex numbers (re, im, re, im, ...). */
static q31_t app_fft1024_part1Data[2048];  /* even-indexed input samples, transformed in place. */
static q31_t app_fft1024_part2Data[2048];  /* odd-indexed input samples, transformed in place. */
static q31_t app_fft1024_part2Data2[2048]; /* part2 FFT output after the twiddle multiplication. */
static q31_t app_fft1024_result[2048];     /* part1 + twiddled part2. */
static q31_t app_fft1024_input[2048];      /* real parts of the self-test data, shifted right by 5 bits. */

/*******************************************************************************
 * Code
 ******************************************************************************/


/*!
 * @brief Application entry point.
 *
 * Initializes the board, the timer and the PowerQuad, then walks through
 * app_funcs[] forever. Before each demo it waits for one character from
 * the console and echoes it back.
 */
int main(void)
{
    const uint32_t demo_count = sizeof(app_funcs) / sizeof(app_funcs[0]);

    board_init();
    printf("powerquad fft 1024.\r\n");

    timer_init();       /* systick. */
    PQ_Init(POWERQUAD); /* powerquad. */

#if 0
    timer_start();
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_part1Data[i] = i;
    }
    timer_stop(app_timer_ticks);
    printf("app_timer_ticks: %d ticks, %d us\r\n", app_timer_ticks, app_timer_ticks/ 150u);
#endif

    for (;;)
    {
        uint32_t demo = 0u;

        while (demo < demo_count)
        {
            /* Block until a key arrives, echo it, then run the next demo. */
            char key = getchar();
            putchar(key);

            (*app_funcs[demo])();
            demo++;
        }
    }
}

/* 使用CMSIS-DSP中的1024算子, 一个函数搞定1024个点的FFT. */
void fft1024_arm_func1(void)
{
    printf("\r\n%s()\r\n", __func__);

    memset(app_fft1024_input     , 0, sizeof(app_fft1024_input));
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_input[2*i] = fft1024_selftestData[2*i] >> 5u; /* only 27bit available for powerquad fft. */
    }

    timer_start();

    arm_cfft_q31(&arm_cfft_sR_q31_len1024, app_fft1024_input, 0, 1);

    timer_stop(app_timer_ticks);
    printf(" * total time: %d us\r\n", app_timer_ticks/ 150u);

#if FFT1024_ENABLE_PRINT_RESULT
    printf("\r\nResult:\r\n");
    for (uint32_t i = 0u; i < 512u; i++) /* only first half numbers are OK. */
    //for (uint32_t i = 0u; i < 12u; i++) /* only first half numbers are OK. */
    {
        printf("%4d : %8d, %8d.\r\n", i, app_fft1024_input[(2 * i)], app_fft1024_input[(2 * i) + 1]);
    }
#endif /* FFT1024_ENABLE_PRINT_RESULT */
}

/*
 * 不要纠结FFT计算结果的一致性, FFT主要用于频谱分析, 主要不同频点能量的相对关系.
 * 在不同算法得到的结果中, 相对关系总是正确的, 只是不同算法的绝对值可能有倍数关系.
 */
/* 将一个1024点的FFT, 分解成两个512点的FFT, 再配合一组复数的乘法和加法(twiddle).
 * 这是后续使用PowerQuad实现的算法基础.
 */
void fft1024_arm_func2(void)
{
    printf("\r\n%s()\r\n", __func__);
    uint32_t total_us = 0u;

    /* clear the workspace variables. */
    memset(app_fft1024_part1Data , 0, sizeof(app_fft1024_part1Data));
    memset(app_fft1024_part2Data , 0, sizeof(app_fft1024_part2Data));
    memset(app_fft1024_part2Data2, 0, sizeof(app_fft1024_part2Data2));
    memset(app_fft1024_result    , 0, sizeof(app_fft1024_result));
    memset(app_fft1024_input     , 0, sizeof(app_fft1024_input));
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_input[2*i] = fft1024_selftestData[2*i] >> 5u; /* only 27bit available for powerquad fft. */
    }

    timer_start();
    // Split omitting imaginary part.
    /* 将原始数据拆分成两部分:
     * - app_fft1024_part1Data[]存放第0, 2, 4, ...个数.
     * - app_fft1024_part2Data[]存放第1, 3, 5, ...个数.
     * 此处特别注意, 这三个数组表示的数都是复数, 因此要跳着取数和写数,
     * 同时仅在实部有有效数据, 虚部为0, 因此读数和写数的时候又跳过了虚部的部分.
     */
    for (uint32_t i = 0u; i < 512u; i++)
    {
        app_fft1024_part1Data[(2 * i)] = app_fft1024_input[(4 * i)];
        app_fft1024_part2Data[(2 * i)] = app_fft1024_input[(4 * i) + 2];
    }

    timer_stop(app_timer_ticks);
    printf(" - split in two vectors [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    // Calculate 2 FFTs.
    /* 原地计算两个较小的FFT. */
    arm_cfft_q31(&arm_cfft_sR_q31_len512, app_fft1024_part1Data, 0, 1);
    arm_cfft_q31(&arm_cfft_sR_q31_len512, app_fft1024_part2Data, 0, 1);

    timer_stop(app_timer_ticks);
    printf(" - two fft [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

#if 0
  // Double the vectors: not necessary because FFT symmetrie not used.
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_part1Data[1024 + i] = app_fft1024_part1Data[i];
        app_fft1024_part2Data[1024 + i] = app_fft1024_part2Data[i];
    }
#endif

    timer_start();

    // Complex multiplication: (a + jb) * (c + jd) = (ac - bd) + j(bc + ad)
    // Multiply part2 with cosTable as real and sinTable as imaginary part.
    /* 复数乘法, part2部分再乘以e^-jw. */
    for (uint32_t i = 0u; i < 512u; i++)
    //for (uint32_t i = 0u; i < 1024u; i++)
    {
        // RE : ac - bd
        app_fft1024_part2Data2[(2 * i)]     = (   (app_fft1024_part2Data[(2 * i)] * fft1024_cosTable[i])
                                                - (app_fft1024_part2Data[(2 * i) + 1] * fft1024_sinTable[i]) ) / FFT1024_TABLE_MULTIPLIER;
        // IM : bc + ad
        app_fft1024_part2Data2[(2 * i) + 1] = (   (app_fft1024_part2Data[(2 * i) + 1] * fft1024_cosTable[i])
                                                + (app_fft1024_part2Data[(2 * i)] * fft1024_sinTable[i])     ) / FFT1024_TABLE_MULTIPLIER;
    }

    timer_stop(app_timer_ticks);
    printf(" - vector complex multiplication [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    // Complex addition of the two parts.
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_result[i] = app_fft1024_part1Data[i] + app_fft1024_part2Data2[i];
    }

    timer_stop(app_timer_ticks);
    printf(" - vector complex addition [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);
    printf(" * total time: %d us\r\n", total_us);

#if FFT1024_ENABLE_PRINT_RESULT
    printf("\r\nResult:\r\n");
    for (uint32_t i = 0u; i < 512u; i++) /* only first half numbers are OK. */
    //for (uint32_t i = 0u; i < 12u; i++) /* only first half numbers are OK. */
    {
        printf("%4d : %8d, %8d.\r\n", i, app_fft1024_result[(2 * i)], app_fft1024_result[(2 * i) + 1]);
    }
#endif /* FFT1024_ENABLE_PRINT_RESULT. */
}

/* 使用PowerQuad加速FFT部分, 复数向量的乘法和加法在PowerQuad硬件上没有原生支持, 暂用纯软的做法实现. */
void fft1024_powerquad_func1(void)
{
    printf("\r\n%s()\r\n", __func__);
    uint32_t total_us = 0u;

    /* clear the workspace variables. */
    memset(app_fft1024_part1Data , 0, sizeof(app_fft1024_part1Data));
    memset(app_fft1024_part2Data , 0, sizeof(app_fft1024_part2Data));
    memset(app_fft1024_part2Data2, 0, sizeof(app_fft1024_part2Data2));
    memset(app_fft1024_result    , 0, sizeof(app_fft1024_result));
    memset(app_fft1024_input     , 0, sizeof(app_fft1024_input));
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_input[2*i] = fft1024_selftestData[2*i] >> 5u; /* only 27bit available for powerquad fft. */
    }

    timer_start();

    for (uint32_t i = 0u; i < 512u; i++)
    {
        app_fft1024_part1Data[(2 * i)] = app_fft1024_input[(4 * i)];
        app_fft1024_part2Data[(2 * i)] = app_fft1024_input[(4 * i) + 2];
    }

    timer_stop(app_timer_ticks);
    printf(" - split in two vectors [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();
    pq_config_t pq_config;
    pq_config.inputAFormat = kPQ_32Bit;
    pq_config.inputAPrescale = 0;
    pq_config.inputBFormat = kPQ_32Bit;
    pq_config.inputBPrescale = 0;
    pq_config.outputFormat = kPQ_32Bit;
    pq_config.outputPrescale = 0;
    pq_config.tmpFormat = kPQ_32Bit;
    pq_config.tmpPrescale = 0;
    pq_config.machineFormat = kPQ_32Bit;
    pq_config.tmpBase = (uint32_t *)0xE0000000; /* internal memory. */
    PQ_SetConfig(POWERQUAD, &pq_config);
    PQ_TransformCFFT(POWERQUAD, 512, app_fft1024_part1Data, app_fft1024_part1Data);
    PQ_WaitDone(POWERQUAD);
    PQ_TransformCFFT(POWERQUAD, 512, app_fft1024_part2Data, app_fft1024_part2Data);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - two fft [PQ]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    for (uint32_t i = 0u; i < 512u; i++)
    //for (uint32_t i = 0u; i < 1024u; i++)
    {
        // RE : ac - bd
        app_fft1024_part2Data2[(2 * i)]     = (   (app_fft1024_part2Data[(2 * i)] * fft1024_cosTable[i])
                                                - (app_fft1024_part2Data[(2 * i) + 1] * fft1024_sinTable[i]) ) / FFT1024_TABLE_MULTIPLIER;
        // IM : bc + ad
        app_fft1024_part2Data2[(2 * i) + 1] = (   (app_fft1024_part2Data[(2 * i)] * fft1024_sinTable[i])
                                                + (app_fft1024_part2Data[(2 * i) + 1] * fft1024_cosTable[i]) ) / FFT1024_TABLE_MULTIPLIER;
    }

    timer_stop(app_timer_ticks);
    printf(" - vector complex multiplication [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();
    // Complex addition of the two parts.
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_result[i] = app_fft1024_part1Data[i] + app_fft1024_part2Data2[i];
    }

    timer_stop(app_timer_ticks);
    printf(" - vector complex addition [ARM]: %d us\r\n", app_timer_ticks/ 150u);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    printf(" * total time: %d us\r\n", total_us);

#if FFT1024_ENABLE_PRINT_RESULT
    printf("\r\nResult:\r\n");
    for (uint32_t i = 0u; i < 512u; i++) /* only first half numbers are OK. */
    //for (uint32_t i = 0u; i < 12u; i++) /* only first half numbers are OK. */
    {
        printf("%4d : %8d, %8d.\r\n", i, app_fft1024_result[(2 * i)], app_fft1024_result[(2 * i) + 1]);
    }
#endif
}

/* 使用PowerQuad加速FFT部分和复数加法部分, 复数向量乘法部分仍使用软件实现. */
void fft1024_powerquad_func2(void)
{
    printf("\r\n%s()\r\n", __func__);
    uint32_t total_us = 0u;

    /* clear the workspace variables. */
    memset(app_fft1024_part1Data , 0, sizeof(app_fft1024_part1Data));
    memset(app_fft1024_part2Data , 0, sizeof(app_fft1024_part2Data));
    memset(app_fft1024_part2Data2, 0, sizeof(app_fft1024_part2Data2));
    memset(app_fft1024_result    , 0, sizeof(app_fft1024_result));
    memset(app_fft1024_input     , 0, sizeof(app_fft1024_input));
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_input[2*i] = fft1024_selftestData[2*i] >> 5u; /* only 27bit available for powerquad fft. */
    }

    timer_start();
    for (uint32_t i = 0u; i < 512u; i++)
    {
        app_fft1024_part1Data[(2 * i)] = app_fft1024_input[(4 * i)];
        app_fft1024_part2Data[(2 * i)] = app_fft1024_input[(4 * i) + 2];
    }
    timer_stop(app_timer_ticks);
    printf(" - split in two vectors [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    pq_config_t pq_config;
    pq_config.inputAFormat = kPQ_32Bit;
    pq_config.inputAPrescale = 0;
    pq_config.inputBFormat = kPQ_32Bit;
    pq_config.inputBPrescale = 0;
    pq_config.outputFormat = kPQ_32Bit;
    pq_config.outputPrescale = 0;
    pq_config.tmpFormat = kPQ_32Bit;
    pq_config.tmpPrescale = 0;
    pq_config.machineFormat = kPQ_32Bit;
    pq_config.tmpBase = (uint32_t *)0xE0000000; /* internal memory. */
    PQ_SetConfig(POWERQUAD, &pq_config);
    PQ_TransformCFFT(POWERQUAD, 512, app_fft1024_part1Data, app_fft1024_part1Data);
    PQ_WaitDone(POWERQUAD);
    PQ_TransformCFFT(POWERQUAD, 512, app_fft1024_part2Data, app_fft1024_part2Data);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - two fft [PQ]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    for (uint32_t i = 0u; i < 512u; i++)
    //for (uint32_t i = 0u; i < 1024u; i++)
    {
        // RE : ac - bd
        app_fft1024_part2Data2[(2 * i)]     = (   (app_fft1024_part2Data[(2 * i)] * fft1024_cosTable[i])
                                                - (app_fft1024_part2Data[(2 * i) + 1] * fft1024_sinTable[i]) ) / FFT1024_TABLE_MULTIPLIER;
        // IM : bc + ad
        app_fft1024_part2Data2[(2 * i) + 1] = (   (app_fft1024_part2Data[(2 * i)] * fft1024_sinTable[i])
                                                + (app_fft1024_part2Data[(2 * i) + 1] * fft1024_cosTable[i]) ) / FFT1024_TABLE_MULTIPLIER;
    }

    timer_stop(app_timer_ticks);
    printf(" - vector complex multiplication [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    pq_config.machineFormat = kPQ_Float;
    PQ_SetConfig(POWERQUAD, &pq_config);
    uint32_t length = POWERQUAD_MAKE_MATRIX_LEN(16, 16, 16);
    // Complex addition of the two parts.
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[0], &app_fft1024_part2Data2[0], &app_fft1024_result[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[256], &app_fft1024_part2Data2[256], &app_fft1024_result[256]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[512], &app_fft1024_part2Data2[512], &app_fft1024_result[512]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[768], &app_fft1024_part2Data2[768], &app_fft1024_result[768]);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - vector complex addition [PQ]: %d us\r\n", app_timer_ticks/ 150u);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    printf(" * total time: %d us\r\n", total_us);

#if FFT1024_ENABLE_PRINT_RESULT
    printf("\r\nResult:\r\n");
    for (uint32_t i = 0u; i < 512u; i++) /* only first half numbers are OK. */
    //for (uint32_t i = 0u; i < 12u; i++) /* only first half numbers are OK. */
    {
        printf("%4d : %8d, %8d.\r\n", i, app_fft1024_result[(2 * i)], app_fft1024_result[(2 * i) + 1]);
    }
#endif
}

/* Planar (separate real / imaginary) scratch vectors used by
 * fft1024_powerquad_func3() for its twiddle multiplication. */
q31_t app_fft1024_part2_in_rel[512];   /* real parts of the part2 FFT output. */
q31_t app_fft1024_part2_in_img[512];   /* imaginary parts of the part2 FFT output. */
q31_t app_fft1024_part2_out_rel[512];  /* real * cos, then the final real part of the product. */
q31_t app_fft1024_part2_out_img[512];  /* real * sin, then the final imaginary part of the product. */
q31_t app_fft1024_part2_out2_rel[512]; /* imaginary * sin (subtracted from out_rel). */
q31_t app_fft1024_part2_out2_img[512]; /* imaginary * cos (added to out_img). */

/* 终极大招, 使用PowerQuad加速FFT, 复数加法, 配合大量的内存实现复数乘法. */
void fft1024_powerquad_func3(void)
{
    printf("\r\n%s()\r\n", __func__);
    uint32_t total_us = 0u;

    /* clear the workspace variables. */
    memset(app_fft1024_part1Data , 0, sizeof(app_fft1024_part1Data));
    memset(app_fft1024_part2Data , 0, sizeof(app_fft1024_part2Data));
    memset(app_fft1024_part2Data2, 0, sizeof(app_fft1024_part2Data2));
    memset(app_fft1024_result    , 0, sizeof(app_fft1024_result));
    memset(app_fft1024_input     , 0, sizeof(app_fft1024_input));
    for (uint32_t i = 0u; i < 1024u; i++)
    {
        app_fft1024_input[2*i] = fft1024_selftestData[2*i] >> 5u; /* only 27bit available for powerquad fft. */
    }

    timer_start();

    for (uint32_t i = 0u; i < 512u; i++)
    {
        app_fft1024_part1Data[(2 * i)] = app_fft1024_input[(4 * i)];
        app_fft1024_part2Data[(2 * i)] = app_fft1024_input[(4 * i) + 2];
    }

    timer_stop(app_timer_ticks);
    printf(" - split in two vectors [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    pq_config_t pq_config;
    pq_config.inputAFormat = kPQ_32Bit;
    pq_config.inputAPrescale = 0;
    pq_config.inputBFormat = kPQ_32Bit;
    pq_config.inputBPrescale = 0;
    pq_config.outputFormat = kPQ_32Bit;
    pq_config.outputPrescale = 0;
    pq_config.tmpFormat = kPQ_32Bit;
    pq_config.tmpPrescale = 0;
    pq_config.machineFormat = kPQ_32Bit;
    pq_config.tmpBase = (uint32_t *)0xE0000000; /* internal memory. */
    PQ_SetConfig(POWERQUAD, &pq_config);
    PQ_TransformCFFT(POWERQUAD, 512, app_fft1024_part1Data, app_fft1024_part1Data);
    PQ_WaitDone(POWERQUAD);
    PQ_TransformCFFT(POWERQUAD, 512, app_fft1024_part2Data, app_fft1024_part2Data);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - two fft [PQ]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    /* split into real part and img part. */
    for (uint32_t i = 0u; i < 512u; i++)
    {
        app_fft1024_part2_in_rel[i] = app_fft1024_part2Data[2u*i];
        app_fft1024_part2_in_img[i] = app_fft1024_part2Data[2u*i+1u];
    }

    timer_stop(app_timer_ticks);
    printf(" - split fft2 vector [ARM]: %d\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    /* the following work are using matrix engine. */
    pq_config.machineFormat = kPQ_Float;
    PQ_SetConfig(POWERQUAD, &pq_config);
    uint32_t length = POWERQUAD_MAKE_MATRIX_LEN(16, 16, 16);

    /* vector complex multiplication. */
    /* for real part of output. */
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_rel[0], &fft1024_cosTable[0], &app_fft1024_part2_out_rel[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_rel[256], &fft1024_cosTable[256], &app_fft1024_part2_out_rel[256]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_img[0], &fft1024_sinTable[0], &app_fft1024_part2_out2_rel[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_img[256], &fft1024_sinTable[256], &app_fft1024_part2_out2_rel[256]);
    PQ_WaitDone(POWERQUAD);

    PQ_MatrixSubtraction(POWERQUAD, length, &app_fft1024_part2_out_rel[0], &app_fft1024_part2_out2_rel[0], &app_fft1024_part2_out_rel[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixSubtraction(POWERQUAD, length, &app_fft1024_part2_out_rel[256], &app_fft1024_part2_out2_rel[256], &app_fft1024_part2_out_rel[256]);
    PQ_WaitDone(POWERQUAD);

    /* for img part of output. */
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_rel[0], &fft1024_sinTable[0], &app_fft1024_part2_out_img[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_rel[256], &fft1024_sinTable[256], &app_fft1024_part2_out_img[256]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_img[0], &fft1024_cosTable[0], &app_fft1024_part2_out2_img[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixProduct(POWERQUAD, length, &app_fft1024_part2_in_img[256], &fft1024_cosTable[256], &app_fft1024_part2_out2_img[256]);
    PQ_WaitDone(POWERQUAD);

    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part2_out_img[0], &app_fft1024_part2_out2_img[0], &app_fft1024_part2_out_img[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part2_out_img[256], &app_fft1024_part2_out2_img[256], &app_fft1024_part2_out_img[256]);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - vector complex multiplication - part i [PQ]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    /* assemble the result. */
    for (uint32_t i = 0u; i < 512u; i++)
    {
        app_fft1024_part2Data2[2u*i]    = app_fft1024_part2_out_rel[i];
        app_fft1024_part2Data2[2u*i+1u] = app_fft1024_part2_out_img[i];
    }

    timer_stop(app_timer_ticks);
    printf(" - vector complex multiplication - part ii [ARM]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    PQ_MatrixScale(POWERQUAD, length, 0.001f, &app_fft1024_part2Data2[0], &app_fft1024_part2Data2[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixScale(POWERQUAD, length, 0.001f, &app_fft1024_part2Data2[256], &app_fft1024_part2Data2[256]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixScale(POWERQUAD, length, 0.001f, &app_fft1024_part2Data2[512], &app_fft1024_part2Data2[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixScale(POWERQUAD, length, 0.001f, &app_fft1024_part2Data2[768], &app_fft1024_part2Data2[256]);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - vector complex multiplication - part iii [PQ]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    timer_start();

    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[0], &app_fft1024_part2Data2[0], &app_fft1024_result[0]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[256], &app_fft1024_part2Data2[256], &app_fft1024_result[256]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[512], &app_fft1024_part2Data2[512], &app_fft1024_result[512]);
    PQ_WaitDone(POWERQUAD);
    PQ_MatrixAddition(POWERQUAD, length, &app_fft1024_part1Data[768], &app_fft1024_part2Data2[768], &app_fft1024_result[768]);
    PQ_WaitDone(POWERQUAD);

    timer_stop(app_timer_ticks);
    printf(" - vector complex addition [PQ]: %d us\r\n", app_timer_ticks / TIMER_TICKS_PER_US);
    total_us += (app_timer_ticks / TIMER_TICKS_PER_US);

    printf(" * total time: %d us\r\n", total_us);

#if FFT1024_ENABLE_PRINT_RESULT
    printf("\r\nResult:\r\n");
    for (uint32_t i = 0u; i < 512u; i++) /* only first half numbers are OK. */
    //for (uint32_t i = 0u; i < 12u; i++) /* only first half numbers are OK. */
    {
        printf("%4d : %8d, %8d.\r\n", i, app_fft1024_result[(2 * i)], app_fft1024_result[(2 * i) + 1]);
    }
#endif /* FFT1024_ENABLE_PRINT_RESULT */
}

/* EOF. */

